import besca as bc
import pandas as pd
import pkg_resources
import os
The datasets that are already annotated and should be used for training. If you only use one dataset please use list of one.
adata_trains = [bc.datasets.Haber2017_processed()]
The dataset of interest that should be annotated.
adata_pred = bc.datasets.Smillie2019_processed()
adata_orig = bc.datasets.Smillie2019_processed()
Define level of dblabel reference annotation
level = 2
Give your analysis a name.
analysis_name = 'auto_annot_Smillie2019_with_Haber2017_dblabel_l' + str(level)
Specify column name of celltype annotation you want to train on.
celltype_train ='dblabel_l' + str(level)
celltype_test = 'dblabel_l' + str(level)
Choose a method:
method = 'logistic_regression_elastic' # 'logistic_regression'
Specify merge method if using multiple training datasets. Needs to be either scanorama or naive.
merge = 'scanorama'
Decide if you want to use the raw format or highly variable genes. Raw increases computational time and does not necessarily improve predictions.
use_raw = False
You can choose to only consider a subset of genes from a signature set.
genes_to_use = 'all'
new_cnames = bc.tl.sig.obtain_new_label(
nomenclature_file=pkg_resources.resource_filename('besca', 'datasets/nomenclature/CellTypes_v1.tsv'),
cnames=list(adata_trains[0].obs['dblabel'].cat.categories),
reference_label='dblabel',
new_label='dblabel',
new_level=level)
new_cnames
| new_label | |
|---|---|
| goblet cell | glandular epithelial cell |
| proliferating epithelial fate stem cell | stem cell |
| enterocyte | endo-epithelial cell |
| epithelial fate stem cell | stem cell |
| immature goblet cell | glandular epithelial cell |
| brush cell | endo-epithelial cell |
| proliferating transit amplifying cell | endo-epithelial cell |
| paneth cell | endo-epithelial cell |
| transit amplifying cell | endo-epithelial cell |
| proliferating enterocyte progenitor | endo-epithelial cell |
| immature enterocyte | endo-epithelial cell |
| enteroendocrine cell | glandular epithelial cell |
| enterocyte progenitor | endo-epithelial cell |
adata_trains[0].obs['dblabel_l' + str(level)] = bc.tl.sig.add_anno(adata_trains[0], new_cnames, 'new_label', 'dblabel')
new_cnames = bc.tl.sig.obtain_new_label(
nomenclature_file=pkg_resources.resource_filename('besca', 'datasets/nomenclature/CellTypes_v1.tsv'),
cnames=list(adata_pred.obs['dblabel'].cat.categories),
reference_label='dblabel',
new_label='dblabel',
new_level=level)
new_cnames
| new_label | |
|---|---|
| CD1c-positive myeloid dendritic cell | myeloid leukocyte |
| CD4-positive, alpha-beta memory T cell | lymphocyte |
| CD8-positive, alpha-beta T cell | lymphocyte |
| CD8-positive, alpha-beta cytokine secreting effector T cell | lymphocyte |
| CD141-positive myeloid dendritic cell | myeloid leukocyte |
| HEV endothelial cell | endothelial cell of vascular tree |
| activated CD4-positive, alpha-beta T cell | lymphocyte |
| brush cell | endo-epithelial cell |
| endothelial cell | endothelial cell |
| enterocyte | endo-epithelial cell |
| enterocyte progenitor | endo-epithelial cell |
| enteroendocrine cell | glandular epithelial cell |
| exhausted-like CD4-positive, alpha-beta T cell | lymphocyte |
| fibroblast | fibroblast |
| follicular B cell | lymphocyte of B lineage |
| germinal center B cell | lymphocyte of B lineage |
| glial cell | glial cell |
| goblet cell | glandular epithelial cell |
| immature enterocyte | endo-epithelial cell |
| immature goblet cell | glandular epithelial cell |
| inflammatory fibroblast | fibroblast |
| inflammatory monocyte | myeloid leukocyte |
| innate lymphoid cell | lymphocyte |
| macrophage | myeloid leukocyte |
| mast cell | myeloid leukocyte |
| microfold cell | endo-epithelial cell |
| microvascular endothelial cell | endothelial cell of vascular tree |
| myofibroblast cell | myofibroblast cell |
| natural killer cell | lymphocyte |
| pericyte cell | pericyte cell |
| plasma cell | lymphocyte of B lineage |
| proliferating B cell | lymphocyte of B lineage |
| proliferating T cell | lymphocyte |
| proliferating monocyte | myeloid leukocyte |
| proliferating transit amplifying cell | endo-epithelial cell |
| regulatory T cell | lymphocyte |
| stem cell | stem cell |
| transit amplifying cell | endo-epithelial cell |
adata_pred.obs['dblabel_l' + str(level)] = bc.tl.sig.add_anno(adata_pred, new_cnames, 'new_label', 'dblabel')
adata_orig.obs['dblabel_l' + str(level)] = adata_pred.obs['dblabel_l' + str(level)]
# Select epithelial subset from Smillie2019 dataset
epithelial_subset = bc.subset_adata(adata_orig, adata_orig.obs.celltype_highlevel == 'Epi', raw=False)
adata_orig = epithelial_subset
# Convert mouse symbols (HGNC) to human symbols (MGI)
mousehuman_file = pkg_resources.resource_filename('besca', 'datasets/homologs/MGItoHGNC.csv')
mousehuman=pd.read_csv(mousehuman_file,sep='\t',header='infer', encoding="unicode_escape")
mousehuman.index=mousehuman['HGNC']
conversion=pd.Series(data=mousehuman['MGI'], index=mousehuman.index)
# Convert mouse symbols (HGNC) to human symbols (MGI)
adata_orig.var.rename(columns={'SYMBOL':'HGNC'}, inplace=True)
adata_orig.var['SYMBOL'] = adata_orig.var['HGNC'].map(lambda x: conversion.get(x, default='') if type(conversion.get(x, default='')) == str else str(conversion.get(x, default=None).values[0]))
adata_orig.var.index = adata_orig.var.SYMBOL
adata_orig.var_names_make_unique(join='abcdef')
adata_pred = adata_orig.copy()
This function merges training datasets, removes unwanted genes, and if scanorama is used corrects for datasets.
adata_train, adata_pred = bc.tl.auto_annot.merge_data(adata_trains, adata_pred, genes_to_use = genes_to_use, merge = merge)
merging with scanorama using scanorama rn Found 270 genes among all datasets [[0. 0.76642805] [0. 0. ]] Processing datasets (0, 1) integrating training set calculating intersection
The returned scaler is fitted on the training dataset (to zero mean and scaled to unit variance).
classifier, scaler = bc.tl.auto_annot.fit(adata_train, method, celltype_train)
[Parallel(n_jobs=10)]: Using backend ThreadingBackend with 10 concurrent workers.
convergence after 12 epochs took 1 seconds convergence after 13 epochs took 1 seconds convergence after 14 epochs took 1 seconds convergence after 15 epochs took 2 seconds convergence after 15 epochs took 2 seconds convergence after 15 epochs took 2 seconds convergence after 45 epochs took 4 seconds convergence after 43 epochs took 4 seconds convergence after 47 epochs took 4 seconds max_iter reached after 9 seconds max_iter reached after 10 seconds max_iter reached after 10 seconds max_iter reached after 9 seconds max_iter reached after 9 seconds max_iter reached after 9 seconds max_iter reached after 9 seconds max_iter reached after 9 seconds max_iter reached after 9 seconds max_iter reached after 9 seconds max_iter reached after 11 seconds max_iter reached after 10 seconds max_iter reached after 9 seconds max_iter reached after 9 seconds max_iter reached after 9 seconds max_iter reached after 9 seconds max_iter reached after 9 seconds max_iter reached after 9 seconds max_iter reached after 9 seconds max_iter reached after 9 seconds max_iter reached after 9 seconds
[Parallel(n_jobs=10)]: Done 3 out of 3 | elapsed: 1.2min finished
Use fitted model to predict celltypes in adata_pred. Prediction will be added in a new column called 'auto_annot'. Paths are needed as adata_pred will revert to its original state (all genes, no additional corrections). The threshold should be set to 0 or left out for SVM. For logisitic regression the threshold can be set.
adata_predicted = bc.tl.auto_annot.adata_predict(classifier = classifier, scaler = scaler, adata_pred = adata_pred, adata_orig = adata_orig, threshold = 0)
Write out metrics to a report file, create confusion matrices and comparative umap plots
adata_pred.obs
| CELL | Cluster | Health | Location | Subject | celltype_highlevel | nGene | nUMI | original_name | percent_mito | n_counts | n_genes | batch | leiden | dblabel | celltype | cluster_celltype | Type | dblabel_l2 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | N7.EpiA.AAGCAAGAGTCAAC-Epi | Cycling TA | Non-inflamed | Epi | N7 | Epi | 1507 | 7428 | N7.EpiA.AAGCAAGAGTCAAC | 0.057351 | 7428.0 | 1507 | N7 | 8 | proliferating transit amplifying cell | epithelial cell | 8: epithelial cell | Epithelial | endo-epithelial cell |
| 1 | N7.EpiA.ACGAGGGAGCTGAT-Epi | Enterocyte Progenitors | Non-inflamed | Epi | N7 | Epi | 828 | 2877 | N7.EpiA.ACGAGGGAGCTGAT | 0.009037 | 2877.0 | 828 | N7 | 0 | enterocyte progenitor | epithelial cell | 0: epithelial cell | Epithelial | endo-epithelial cell |
| 2 | N7.EpiA.ACGTTTACTGGTAC-Epi | Immature Enterocytes 2 | Non-inflamed | Epi | N7 | Epi | 2318 | 15332 | N7.EpiA.ACGTTTACTGGTAC | 0.133707 | 15332.0 | 2318 | N7 | 7 | immature enterocyte | enterocyte | 7: enterocyte | Epithelial | endo-epithelial cell |
| 3 | N7.EpiA.AGAGAATGGTCATG-Epi | Enterocyte Progenitors | Non-inflamed | Epi | N7 | Epi | 884 | 3498 | N7.EpiA.AGAGAATGGTCATG | 0.002001 | 3498.0 | 884 | N7 | 7 | enterocyte progenitor | enterocyte | 7: enterocyte | Epithelial | endo-epithelial cell |
| 4 | N7.EpiA.AGAGCGGAGTATGC-Epi | TA 1 | Non-inflamed | Epi | N7 | Epi | 858 | 3261 | N7.EpiA.AGAGCGGAGTATGC | 0.003067 | 3261.0 | 858 | N7 | 0 | transit amplifying cell | epithelial cell | 0: epithelial cell | Epithelial | endo-epithelial cell |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 46097 | N110.LPB.TTTCCTCCATTCTTAC-Epi | Goblet | Inflamed | LP | N110 | Epi | 1266 | 4944 | N110.LPB.TTTCCTCCATTCTTAC | 0.195793 | 4943.0 | 1266 | N110 | 6 | goblet cell | goblet cell | 6: goblet cell | Epithelial | glandular epithelial cell |
| 46098 | N110.LPB.TTTCCTCTCTGATTCT-Epi | Cycling TA | Inflamed | LP | N110 | Epi | 3826 | 15878 | N110.LPB.TTTCCTCTCTGATTCT | 0.100642 | 15877.0 | 3826 | N110 | 8 | proliferating transit amplifying cell | epithelial cell | 8: epithelial cell | Epithelial | endo-epithelial cell |
| 46099 | N110.LPB.TTTGCGCTCCAGAAGG-Epi | Cycling TA | Inflamed | LP | N110 | Epi | 3275 | 12559 | N110.LPB.TTTGCGCTCCAGAAGG | 0.079385 | 12559.0 | 3275 | N110 | 8 | proliferating transit amplifying cell | epithelial cell | 8: epithelial cell | Epithelial | endo-epithelial cell |
| 46100 | N110.LPB.TTTGGTTGTGTGGCTC-Epi | Immature Enterocytes 2 | Inflamed | LP | N110 | Epi | 2553 | 11705 | N110.LPB.TTTGGTTGTGTGGCTC | 0.130115 | 11705.0 | 2553 | N110 | 7 | immature enterocyte | enterocyte | 7: enterocyte | Epithelial | endo-epithelial cell |
| 46101 | N110.LPB.TTTGGTTTCCTTAATC-Epi | TA 2 | Inflamed | LP | N110 | Epi | 3234 | 16164 | N110.LPB.TTTGGTTTCCTTAATC | 0.118721 | 16164.0 | 3234 | N110 | 0 | transit amplifying cell | epithelial cell | 0: epithelial cell | Epithelial | endo-epithelial cell |
46102 rows × 19 columns
%matplotlib inline
bc.tl.report(
adata_pred=adata_predicted,
celltype=celltype_test,
method=method,
analysis_name=analysis_name,
train_datasets = adata_trains,
test_dataset = adata_orig,
merge = merge,
name_prediction='auto_annot',
name_report='auto_annot',
use_raw=use_raw,
remove_nonshared=True,
clustering='leiden',
asymmetric_matrix=True,
delimiter='\t',
verbose=True
)
acc: 0.6
f1: 0.66
ami: 0.21
ari: 0.13
silhouette dblabel_l2: -0.08
silhouette auto_annot: 0.11
pair confusion matrix:
0 1
0 431786038 123133044
1 929753180 640676040
... storing 'auto_annot' as categorical ... storing 'SYMBOL' as categorical
WARNING: saving figure to file figures/umap.ondata_auto_annot_Smillie2019_with_Haber2017_dblabel_l2.png
WARNING: saving figure to file figures/umap.auto_annot_Smillie2019_with_Haber2017_dblabel_l2_dblabel_l2.png
WARNING: saving figure to file figures/umap.auto_annot_Smillie2019_with_Haber2017_dblabel_l2_auto_annot.png
WARNING: saving figure to file figures/umap.auto_annot_Smillie2019_with_Haber2017_dblabel_l2_leiden.png
import scanpy as sc
sc.pl.umap(adata_predicted, color=[celltype_test, 'auto_annot'])
sc.pl.umap(adata_predicted, color=[celltype_test, 'auto_annot'], legend_loc='on data', legend_fontsize=8)
adata_train
View of AnnData object with n_obs × n_vars = 10896 × 270
obs: 'CELL', 'CONDITION', 'sample_type', 'donor', 'region_x', 'sample', 'percent_mito', 'n_counts', 'n_genes', 'batch', 'leiden', 'celltype0', 'celltype1', 'celltype2', 'celltype3', 'dblabel', 'barcode', 'region_y', 'cell_label', '_merge', 'dblabel_l2'
var: 'SYMBOL', 'ENSEMBL-0', 'n_cells-0', 'total_counts-0', 'frac_reads-0', 'highly_variable-0', 'means-0', 'dispersions-0', 'dispersions_norm-0', 'mean-0', 'std-0', 'ENSEMBL-1', 'n_cells-1', 'total_counts-1', 'frac_reads-1', 'HGNC-1'
uns: 'celltype0_colors', 'celltype1_colors', 'celltype2_colors', 'celltype3_colors', 'hvg', 'leiden', 'leiden_colors', 'rank_genes', 'region_colors', 'umap'
obsm: 'X_pca', 'X_tsne', 'X_umap', 'X_scanorama'
adata_predicted_wo_unknown = adata_predicted.copy()
adata_predicted_wo_unknown = bc.subset_adata(adata_predicted_wo_unknown, adata_predicted_wo_unknown.obs.auto_annot != 'unknown', raw=False)
bc.pl.riverplot_2categories(adata_predicted_wo_unknown, [celltype_test, 'auto_annot'])
# Compare to random assignment
import random
random.seed(1)
adata_predicted.obs['random_labeling'] = list(adata_predicted.obs[celltype_test].sample(frac=1))
bc.tl.report(
adata_pred=adata_predicted,
celltype=celltype_test,
method="compare_to_random_" + method,
analysis_name=analysis_name,
train_datasets = adata_trains,
test_dataset = adata_orig,
merge = merge,
name_prediction="random_labeling",
name_report="compare_to_random_auto_annot",
use_raw=use_raw,
remove_nonshared=False,
clustering='leiden',
asymmetric_matrix=True,
delimiter='\t',
verbose=True)
acc: 0.74
f1: 0.74
ami: 0.0
ari: 0.0
silhouette dblabel_l2: -0.08
silhouette random_labeling: -0.01
pair confusion matrix:
0 1
0 146545384 408373698
1 408373698 1162055522
... storing 'random_labeling' as categorical
WARNING: saving figure to file figures/umap.ondata_auto_annot_Smillie2019_with_Haber2017_dblabel_l2.png
WARNING: saving figure to file figures/umap.auto_annot_Smillie2019_with_Haber2017_dblabel_l2_dblabel_l2.png
WARNING: saving figure to file figures/umap.auto_annot_Smillie2019_with_Haber2017_dblabel_l2_random_labeling.png
WARNING: saving figure to file figures/umap.auto_annot_Smillie2019_with_Haber2017_dblabel_l2_leiden.png
from sinfo import sinfo
sinfo()
----- anndata 0.7.5 besca 2.4+57.g5ad53b2 pandas 1.2.2 pkg_resources NA plotly 4.14.3 scanpy 1.6.1 sinfo 0.3.1 sklearn 0.24.1 ----- IPython 7.20.0 jupyter_client 6.1.11 jupyter_core 4.7.1 notebook 6.2.0 ----- Python 3.7.9 | packaged by conda-forge | (default, Dec 9 2020, 21:08:20) [GCC 9.3.0] Linux-3.10.0-693.11.6.el7.x86_64-x86_64-with-centos-7.4.1708-Core 24 logical CPU cores, x86_64 ----- Session information updated at 2021-07-18 09:22
%%javascript
IPython.notebook.kernel.execute('nb_name = "' + IPython.notebook.notebook_name + '"')
nb_name = os.path.join(os.getcwd(), nb_name)
! jupyter nbconvert --to html {nb_name}
[NbConvertApp] Converting notebook /pstore/data/bioinfo/users/hatjek/devel/besca_publication_results/intestine/auto_annot/auto_annot_Smillie2019_with_Haber2017_dblabel_l2.ipynb to html [NbConvertApp] Writing 6310941 bytes to /pstore/data/bioinfo/users/hatjek/devel/besca_publication_results/intestine/auto_annot/auto_annot_Smillie2019_with_Haber2017_dblabel_l2.html